Load required libraries.

library(ggplot2)
library(dplyr)
library(tidyr)
library(purrr)
library(grid)
library(wordbankr)
library(langcog)
theme_set(theme_mikabr())

Load in Wordbank data.

items <- get_item_data() %>%
  filter(type == "word") %>%
  mutate(num_item_id = as.numeric(substr(item_id, 6, nchar(item_id))))

Get vocabulary composition data for all languages.

get_vocab_comp <- function(input_language, input_form) {
  
  lang_vocab_items <- filter(items, language == input_language, form == input_form) %>%
    filter(lexical_category %in% c("nouns", "predicates", "function_words"))
  
  lang_vocab_data <- get_instrument_data(instrument_language = input_language,
                                         instrument_form = input_form,
                                         items = lang_vocab_items$item_id,
                                         iteminfo = lang_vocab_items) %>%
    mutate(value = ifelse(is.na(value), "", value),
           produces = value == "produces",
           understands = value == "produces" | value == "understands") %>%
    select(-value) %>%
    gather(measure, value, produces, understands)
  
  num_words <- nrow(lang_vocab_items)
  
  lang_vocab_summary <- lang_vocab_data %>%
    group_by(data_id, measure, lexical_category) %>%
    summarise(num_true = sum(value),
              prop = sum(value) / n())
  
  lang_vocab_sizes <- lang_vocab_summary %>%
    summarise(vocab = sum(num_true) / num_words)
  
  lang_vocab_summary %>%
    left_join(lang_vocab_sizes) %>%
    select(-num_true) %>%
    mutate(language = input_language, form = input_form)
  
}
instruments <- items %>%
  select(language, form) %>%
  distinct()

vocab_comp_data <- map2(instruments$language, instruments$form, get_vocab_comp) %>%
  bind_rows()

Show sample size of each instrument.

sample_sizes <- vocab_comp_data %>%
  group_by(language, form, measure, lexical_category) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  select(language, form, n) %>%
  distinct()
kable(sample_sizes)
language form n
British Sign Language WG 161
Cantonese WS 987
Croatian WG 250
Croatian WS 377
Danish WS 3714
English WG 2454
English WS 5824
German WS 1183
Hebrew WG 62
Hebrew WS 253
Italian WG 648
Italian WS 752
Mandarin TC 652
Mandarin WS 1056
Norwegian WG 3025
Norwegian WS 12969
Russian WG 768
Russian WS 1037
Spanish WG 778
Spanish WS 1094
Swedish WG 474
Swedish WS 900
Turkish WG 1115
Turkish WS 2422

Base plot for looking at vocabulary composition.

base_plot <- function(input_form, input_measure) {
  vocab_comp_data %>%
    filter(form == input_form, measure == input_measure) %>%
    mutate(lexical_category = factor(lexical_category,
                                     levels = c("nouns", "predicates", "function_words"),
                                     labels = c("Nouns", "Predicates", "Function Words"))) %>%
    ggplot(aes(x = vocab, y = prop, colour = lexical_category)) +
    facet_wrap(~language) +
    geom_abline(slope = 1, intercept = 0, color = "gray", linetype = "dashed") + 
    scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                       name = "Proportion of Category\n") +
    scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                       name = "\nVocabulary Size") +
    scale_color_solarized(name = "Lexical Category") +
    theme(legend.position = c(0.068, 0.95),
          legend.text = element_text(size = 9),
          legend.title = element_text(size = 9, lineheight = unit(0.8, "char")),
          legend.key.height = unit(0.8, "char"),
          legend.key.width = unit(0.3, "cm"),
          legend.key = element_blank(),
          legend.background = element_rect(fill = "transparent"))
}

Plot WS productive vocabulary composition as a function of vocabulary size for each language.

base_plot("WS", "produces") + geom_jitter(size = 0.7)

Plot WG productive vocabulary composition as a function of vocabulary size for each language.

base_plot("WG", "produces") + geom_jitter(size = 0.7)

Plot WG receptive vocabulary composition as a function of vocabulary size for each language.

base_plot("WG", "understands") + geom_jitter(size = 0.7)

Plot WS productive vocabulary composition as a function of vocabulary size for each language with cubic contrained lm curves.

base_plot("WS", "produces") +
  geom_smooth(method = "clm", formula = y ~ I(x ^ 3) + I(x ^ 2) + x - 1)

Plot WG productive vocabulary composition as a function of vocabulary size for each language with cubic contrained lm curves.

base_plot("WG", "produces") +
  geom_smooth(method = "clm", formula = y ~ I(x ^ 3) + I(x ^ 2) + x - 1)

Plot WG receptive vocabulary composition as a function of vocabulary size for each language with cubic contrained lm curves.

base_plot("WG", "understands") +
  geom_smooth(method = "clm", formula = y ~ I(x ^ 3) + I(x ^ 2) + x - 1)

Function for resampling data.

sample_areas <- function(d, nboot = 1000) {
  
  poly_area <- function(group_data) {
    model = clm(prop ~ I(vocab ^ 3) + I(vocab ^ 2) + vocab - 1,
                data = group_data)
    return((model$solution %*% c(1/4, 1/3, 1/2) - 0.5)[1])
  }
  
  counter <- 1
  sample_area <- function(d) {
    d_frame <- d %>%
      group_by(language, form, measure) %>%
      sample_frac(replace = TRUE) %>%
      group_by(language, form, measure, lexical_category) %>%
      do(area = poly_area(.)) %>%
      mutate(area = area[1]) %>%
      rename_(.dots = setNames("area", counter))
    
    counter <<- counter + 1 # increment counter outside scope
    return(d_frame)
  }
  
  areas <- replicate(nboot, sample_area(d), simplify = FALSE)
  
  Reduce(left_join, areas) %>%
    gather(sample, area, -language, -form, -measure, -lexical_category)
}

Resample data, compute area for each sample, find the mean and CI of the area estimate.

areas <- sample_areas(vocab_comp_data, 1000)

area_summary <- areas %>%
  group_by(language, form, measure, lexical_category) %>%
  summarise(mean =  mean(area),
            ci_lower = ci_lower(area),
            ci_upper = ci_upper(area)) %>%
  ungroup() %>%
  mutate(language = factor(language),
         instrument = paste(language, form))

area_order <- filter(area_summary, form == "WS", measure == "produces",
                     lexical_category == "nouns")
language_levels <- area_order$language[order(area_order$mean,
                                             area_order$language,
                                             decreasing = FALSE)]

area_summary_ordered <- area_summary %>%
  filter(form %in% c("WS", "WG"),
         !(form == "WS" & measure == "understands")) %>%
  ungroup() %>%
  mutate(language = factor(language, levels = language_levels),
         lexical_category = factor(lexical_category,
                                   levels = c("nouns", "predicates", "function_words"),
                                   labels = c("Nouns", "Predicates", "Function Words")))

Plot each lexical category’s area estimate by language, form, and measure.

ggplot(area_summary_ordered,
       aes(y = language, x = mean, colour = lexical_category)) +
  facet_grid(lexical_category ~ form + measure) +
  geom_point() +
  geom_segment(aes(x = ci_lower, xend = ci_upper,
                   y = language, yend = language)) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray") + 
  scale_colour_solarized(name = "", guide = FALSE) +
  scale_y_discrete(name = "", limits = levels(area_summary_ordered$language)) +
  xlab("\nRelative representation in early vocabulary")

Plot each lexical category’s area estimate by language and measure for WS only.

ggplot(filter(area_summary_ordered, form == "WS"),
       aes(y = language, x = mean, col = lexical_category)) +
  facet_grid(. ~ lexical_category) +
  geom_point() +
  geom_segment(aes(x = ci_lower, xend = ci_upper,
                   y = language, yend = language)) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray") + 
  scale_colour_solarized(name = "", guide = FALSE) +
  scale_y_discrete(name = "", limits = levels(area_summary_ordered$language)) +
  xlab("\nRelative representation in early vocabulary")

Demo plots of English and Mandarin data and models.

demo_langs <- c("English", "Mandarin")
demo_data <- filter(vocab_comp_data, form == "WS", language %in% demo_langs) %>%
  mutate(panel = paste(language, "(data)"),
         lexical_category = factor(lexical_category,
                                   levels = c("nouns", "predicates", "function_words"),
                                   labels = c("Nouns", "Predicates", "Function Words")))
pts <- seq(0, 1, 0.01)

models <- demo_data %>%
  group_by(language, lexical_category) %>%
  do(model = clm(prop ~ I(vocab ^ 3) + I(vocab ^ 2) + vocab - 1, data = .))

get_lang_lexcat_predictions <- function(lang, lexcat) {
  model <- filter(models, language == lang, lexical_category == lexcat)$model[[1]]
  data.frame(vocab = pts,
             prop = predict(model, newdata = data.frame(vocab = pts)),
             lexical_category = lexcat,
             language = lang)
}

get_lang_predictions <- function(lang) {
  bind_rows(sapply(unique(demo_data$lexical_category),
                   function(lexcat) get_lang_lexcat_predictions(lang, lexcat),
                   simplify = FALSE))
}

predictions <- bind_rows(sapply(demo_langs, get_lang_predictions, simplify = FALSE))

diagonal <- expand.grid(vocab = rep(rev(pts)),
                        language = demo_langs,
                        lexical_category = unique(demo_data$lexical_category))
diagonal$prop <- diagonal$vocab

area_poly <- bind_rows(predictions, diagonal) %>%
  mutate(panel = paste(language, "(models)"))
ggplot(demo_data,
       aes(x = vocab, y = prop, colour = lexical_category, fill = lexical_category)) +
  facet_grid(~ panel) +
  geom_point(size = 0.7) +
  geom_polygon(data = area_poly, alpha = 0.2) +
  geom_abline(slope = 1, intercept = 0, color = "gray", linetype = "dashed") + 
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                     name = "Proportion of Category\n") +
  scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                     name = "\nVocabulary Size") +
  scale_color_solarized(guide = FALSE) +
  scale_fill_solarized(name = "") +
  theme(legend.position = c(0.061, 0.91),
        legend.text = element_text(size = 8),
        legend.key.height = unit(0.9, "char"),
        legend.key.width = unit(0.88, "char"),
        legend.background = element_rect(fill = "transparent"))